Exploring the Arts Council National Portfolio 2018-2022
On 27 June 2017, the Arts Council England announced the National Portfolio Organisations for the next four years. They also provided a factsheet and a dataset!
Now we've explored and had a stab at tidying the dataset, it's time to revisit it systematically.
Author: Edafe Onerhime
Created: 2018-08-04
Last Updated: 2018-08-05
Description: Systematic exploration of the original dataset and the tidy datasets: funding & portfolio.
Contents: Notebook folder with this Jupyter Notebook, data folder, research folder (not sync'd to GitHub)
Notes:
2018-08-05 - Changed direction with this notebook. Initially it was a loose exploration tool, now I'm using it to explore processes and artefacts that speed up systematic data analysis. This notebook will chop and change frequently!
In [1]:
%matplotlib inline
import os.path
import pandas as pd
import numpy as np
import missingno as msno
from IPython.core.display import display, HTML
from urllib.request import urlretrieve
from tabulate import tabulate
In [2]:
"""
Making 'global' variables we'll re-use later easier to set
"""
originalFile = {'fname': 'NPO_successful_20182022_July_6.xlsx',
'url': 'http://www.artscouncil.org.uk/sites/default/files/download-file/NPO_successful_20182022_July_6.xlsx'}
dataFolderOut = os.path.join(os.pardir,'data','output')
dataFolderIn = os.path.join(os.pardir,'data','input')
In [3]:
"""
All the functions we'll need
"""
def customInfo(dfIn):
"""
Summary text and table based on Dataframe.info()
"""
df = dfIn.copy(deep=True)
df.df_name = dfIn.df_name
CI = 'The dataset ' + "'" + df.df_name + "'" + \
' has ' + str(df.shape[1]) + ' columns and ' + str(df.shape[0]) + ' rows. Columns: '
s = df.get_dtype_counts()
for ind, val in s.iteritems():
CI = CI + str(val) + ' ' + ind +', '
CI = CI[:-2]+'.'
dfCI = pd.DataFrame()
df.reset_index(inplace=True)
dfCI['count'] = df.count()
dfCI['unique'] = df.T.apply(lambda x: x.nunique(), axis=1)
dfCI['min'] = df.min()
dfCI['max'] = df.max()
dfCI['median'] = df.median()
dfCI['range'] = dfCI['max'].apply(pd.to_numeric, errors='coerce') - dfCI['min'].apply(pd.to_numeric, errors='coerce')
try:
dfCI['iqr'] = df.quantile(0.75) - df.quantile(0.25)
except:
dfCI['iqr'] = np.nan
dfCI.sort_index(inplace=True)
return dfCI, CI
def portfolioTransform(dfIn, ColumnName, Year, Period, Type):
"""
Returns a dataframe with Portfolio Funding amount, Year e.g. 2015 and Period e.g. 2015-2018
"""
df = dfIn[[ColumnName]].copy(deep=True)
df['Year'] = Year
df['Period'] = Period
df['Type'] = Type
df.rename(columns={ColumnName:'Amount'}, inplace=True)
return df
def describeCategory(dfIn, measure=''):
"""
Returns an array of dataframes showing count and proportion of each category
(1 dataframe per category column)
When a measure is provided, use the measure instead of the count
"""
df = dfIn.copy(deep=True).reset_index()
dataframes = []
i = 0
if len(measure)==0:
for catCol in list(df.select_dtypes(include=['category']).columns):
dataframes.append(pd.DataFrame(df[catCol].value_counts()).reset_index().rename(columns={catCol: 'Count', 'index': catCol}))
dataframes[i]['%'] = dataframes[i][['Count']]/df.shape[0]
dataframes[i].df_name = catCol
i+= 1
else:
for catCol in list(df.select_dtypes(include=['category']).columns):
dataframes.append(pd.DataFrame(df.reset_index().groupby(catCol)[measure].agg('sum').reset_index()))
dataframes[i]['%'] = dataframes[i][[measure]]/df[measure].sum()
dataframes[i].df_name = catCol
i+= 1
return dataframes
In [4]:
"""
Load datasets we'll use later
"""
# Fetch/Load Original
fname = os.path.join(dataFolderIn,originalFile['fname'])
if not os.path.isfile(fname):
print('Downloading:',fname)
url = originalFile['url']
urlretrieve(url, fname)
df = pd.read_excel(open(fname,'rb'), sheetname=0)
df.df_name = 'Original'
"""
Get summary information
"""
dfInfo, infoText = customInfo(df)
print(infoText)
dfInfo
Out[4]:
In [5]:
"""
Tidy and enrich the original dataset
"""
# Strip out carriage returns in column names
df = df.rename(columns={col: col.replace('\n','').replace(' ',' ') for col in df.columns})
df.df_name = 'All'
# Strip out text value 'NEW' so column is all numbers
df.loc[df['% Cash change between 17/18 and 18/19'].str.contains('NEW', na=False), '% Cash change between 17/18 and 18/19'] = np.NaN
# Make these columns categories
catCols = ['Funding Band', 'Area', 'ACE Region', 'ONS Region', 'Discipline', 'Funding Source (GIA or LOT)']
df[catCols] = df[catCols].apply(pd.Categorical)
# Make this column true or false (Bool)
df.loc[df['Portfolio funded in 2015-18?'].str.contains('Yes', na=False), 'Portfolio funded in 2015-18?'] = True
df.loc[df['Portfolio funded in 2015-18?'].str.contains('No', na=False), 'Portfolio funded in 2015-18?'] = False
df['Portfolio funded in 2015-18?'] = df['Portfolio funded in 2015-18?'].astype('bool')
# Make these columns ints
intCols = ['Portfolio funding 15/16 - £', 'Portfolio funding 16/17 - £', 'Portfolio funding 17/18 - £',
'TOTAL Portfolio funding 15/18 - £']
df[intCols] = df[intCols].astype('int')
# Enrich: Funding Source (GIA or LOT) using descriptions here: https://www.artsprofessional.co.uk/news/npo-scheme-merge-ace-grant-aid-and-lottery-funding
df['Funding Source Description'] = df['Funding Source (GIA or LOT)'].astype(str)
df.loc[df['Funding Source Description'].str.contains('GIA', na=False), 'Funding Source Description'] = 'Grant in Aid'
df.loc[df['Funding Source Description'].str.contains('LOT', na=False), 'Funding Source Description'] = 'National Lottery'
# Enrich: Funding Bands using descriptions here: http://www.artscouncil.org.uk/sites/default/files/download-file/Briefing_current_NPOs_MPMs_FINAL.pptx
df['Funding Band Description'] = df['Funding Band'].astype(str)
df.loc[~df['Funding Band Description'].str.contains('SSO'), 'Funding Band Description'] = 'Band ' + df['Funding Band Description'].astype(str)
df.loc[df['Funding Band Description'].str.contains('SSO'), 'Funding Band Description'] = 'Sector Support Organisation'
# Pull information out of notes: Bridge Organisation, Uplift, Reduction, Museum Development
df['Bridge Organisation'] = False
df['Technical Uplift'] = False
df['Technical Reduction'] = False
df['Museum Development'] = False
df.loc[df['Notes'].str.lower().str.contains('bridge organisation', na=False), 'Bridge Organisation'] = True
df.loc[df['Notes'].str.lower().str.contains('uplift', na=False), 'Technical Uplift'] = True
df.loc[df['Notes'].str.lower().str.contains('reduction', na=False), 'Technical Reduction'] = True
df.loc[df['Notes'].str.lower().str.contains('museum development', na=False), 'Museum Development'] = True
# Add Index (Looks like Applicant Name & Funding Band are unique)
df.set_index(['Applicant Name', 'Funding Band'], inplace=True)
"""
Split into Applicant, Classification, Portfolio and save as csv
"""
dfApplicant = df[['Alternative Name', 'Web address', 'Discipline', 'Notes', 'Area', 'ACE Region', 'ONS Region', 'Local Authority']]
dfApplicant.df_name = 'Applicant'
dfClassification = df[['Funding Source (GIA or LOT)', 'Funding Source Description', 'Funding Band Description', 'Portfolio funded in 2015-18?', 'Bridge Organisation', 'Technical Uplift', 'Technical Reduction', 'Museum Development']]
dfClassification.df_name = 'Classification'
dfPortfolio = portfolioTransform(df,'Portfolio funding 15/16 - £','2015','2015-2018','Fund')
dfPortfolio = dfPortfolio.append(portfolioTransform(df,'Portfolio funding 16/17 - £','2016','2015-2018','Fund'))
dfPortfolio = dfPortfolio.append(portfolioTransform(df,'Portfolio funding 17/18 - £','2017','2015-2018','Fund'))
dfPortfolio = dfPortfolio.append(portfolioTransform(df,'Portfolio grant 18/19 - £','2018','2018-2022','Grant'))
dfPortfolio = dfPortfolio.append(portfolioTransform(df,'Portfolio grant 19/20 - £','2019','2018-2022','Grant'))
dfPortfolio = dfPortfolio.append(portfolioTransform(df,'Portfolio grant 20/21 - £','2020','2018-2022','Grant'))
dfPortfolio = dfPortfolio.append(portfolioTransform(df,'Portfolio grant 21/22 - £','2021','2018-2022','Grant'))
# Make Period, Type categories then add Year to index
catCols = ['Year','Period','Type']
dfPortfolio[catCols] = dfPortfolio[catCols].apply(pd.Categorical)
dfPortfolio.set_index('Year', append=True, inplace=True)
dfPortfolio.df_name = 'Portfolio'
# Save csv
filePrefix = 'NPO_2018-22_'
df.to_csv(os.path.join(dataFolderOut,filePrefix+'all.csv'))
dfApplicant.to_csv(os.path.join(dataFolderOut,filePrefix+'applicant.csv'))
dfPortfolio.to_csv(os.path.join(dataFolderOut,filePrefix+'portfolio.csv'))
dfClassification.to_csv(os.path.join(dataFolderOut,filePrefix+'classification.csv'))
In [6]:
"""
Get summary information
"""
dfInfo, infoText = customInfo(df)
print(infoText)
dfInfo, infoText = customInfo(dfApplicant)
print(infoText)
dfInfo, infoText = customInfo(dfPortfolio)
print(infoText)
dfInfo, infoText = customInfo(dfClassification)
print(infoText)
In [7]:
"""
Landscape: Check for missing values using MissingNo
https://github.com/ResidentMario/missingno
"""
msno.matrix(df)
msno.heatmap(df)
msno.bar(df)
msno.dendrogram(df)
In [8]:
print('\n', 'Applicant')
lstDesc = describeCategory(dfApplicant)
for dfDesc in lstDesc:
print('\n',dfDesc.df_name)
display(HTML(tabulate(dfDesc,headers='keys', tablefmt='html', showindex=False, )))
print('\n', 'Portfolio')
lstDesc = describeCategory(dfPortfolio)
for dfDesc in lstDesc:
print('\n',dfDesc.df_name)
display(HTML(tabulate(dfDesc,headers='keys', tablefmt='html', showindex=False, )))
print('\n', 'Classification')
lstDesc = describeCategory(dfClassification)
for dfDesc in lstDesc:
print('\n',dfDesc.df_name)
display(HTML(tabulate(dfDesc,headers='keys', tablefmt='html', showindex=False, )))
In [9]:
print('\n', 'Portfolio')
lstDesc = describeCategory(dfPortfolio,'Amount')
for dfDesc in lstDesc:
print('\n',dfDesc.df_name)
display(HTML(tabulate(dfDesc,headers='keys', tablefmt='html', showindex=False, )))
In [ ]: